In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from process import read_data
from process import process_hours_columns
import plotly.graph_objects as go
In [2]:
years = list(range(2012, 2021))
folder_path = '../data/overtime/'
file_name = 'Details.xlsx'
encoding = 'ISO-8859-1'
target_col = []
drop_col = ['Street', 'xStreet', 'Customer No.', 'Customer', 'Customer Address', 'Customer Address 1', 'Customer Address 3', 'Customer_City', 'Customer_State', 'Customer_zip']
# Read all the data
dfs = read_data(folder_path, file_name, years, encoding, target_col, drop_col, file_type='xlsx')
# # test
# years = list(range(2012, 2013))
# dfs2 = read_data(folder_path, file_name, years, encoding, target_col, drop_col, file_type='xlsx')
Read data from: ../data/overtime/2012_Details.xlsx
Read data from: ../data/overtime/2013_Details.xlsx
Read data from: ../data/overtime/2014_Details.xlsx
Read data from: ../data/overtime/2015_Details.xlsx
Read data from: ../data/overtime/2016_Details.xlsx
Read data from: ../data/overtime/2017_Details.xlsx
Read data from: ../data/overtime/2018_Details.xlsx
Read data from: ../data/overtime/2019_Details.xlsx
Read data from: ../data/overtime/2020_Details.xlsx Successfully read data from all files.
In [3]:
process_hours_columns(dfs, 'Hours\nWorked')
df = pd.concat(dfs, ignore_index=True)
print(df.sample(5))
print(df.columns.tolist())
Job No. Emp. ID Employee Rank Location \
68067 33667.0 102373.0 MYLETT,CHRISTOPHER 9 25 UNION ST
407571 NaN 10728.0 MILIEN,ROCHEFORT 9 NaN
180503 10768.0 11710.0 LOCKHEAD,KEVIN F 9 BENNINGTON ST
49669 92020.0 9704.0 BLICKER,CHARLES R. 9 368 DORCHESTER ST
679338 NaN 106695.0 SAINTFORT,FRANTZ 9 NaN
Detail\nDate Start\nTime End \nTime Hours\nWorked \
68067 2012-10-12 00:00:00 2345 215 2.5
407571 2015-06-04 00:00:00 700 1530 8.5
180503 2013-07-15 00:00:00 1000 1500 5.0
49669 2012-07-19 00:00:00 1600 1700 1.0
679338 2017-10-10 00:00:00 730 1530 8.0
Hours\nPaid Type Address City/State/Zip YEAR \
68067 4 S 25 UNION ST. BOSTON, MA 02108 2012
407571 9 Z NaN NaN 2015
180503 8 Z P.O. BOX 220801 DORCHESTER, MA 02122-0004 2013
49669 4 Z P.O. BOX 220801 DORCHESTER, MA 02122-0004 2012
679338 8 C NaN NaN 2017
Tracking_No c
68067 NaN NaN
407571 1395141.0 NaN
180503 NaN NaN
49669 NaN NaN
679338 NaN 1803973.0
['Job No.', 'Emp. ID', 'Employee', 'Rank', 'Location', 'Detail\nDate', 'Start\nTime', 'End \nTime', 'Hours\nWorked', 'Hours\nPaid', 'Type', 'Address', 'City/State/Zip', 'YEAR', 'Tracking_No', 'c']
In [4]:
print(df.dtypes)
Job No. float64 Emp. ID float64 Employee object Rank int64 Location object Detail\nDate object Start\nTime int64 End \nTime int64 Hours\nWorked float64 Hours\nPaid int64 Type object Address object City/State/Zip object YEAR int64 Tracking_No float64 c float64 dtype: object
In [5]:
# Step 1: Compute the ratio
df['Ratio'] = df['Hours\nWorked'] / df['Hours\nPaid']
# Step 2: Group by Rank and calculate the mean ratio
rank_avg = df.groupby('Rank')['Ratio'].mean().reset_index()
# (Optional) Sort by average ratio for cleaner display
rank_avg = rank_avg.sort_values('Ratio', ascending=False)
# Step 3: Plot one bar per Rank
fig = go.Figure()
# Add a single bar trace
fig.add_trace(go.Bar(
x=rank_avg['Rank'], # X-axis: Rank
y=rank_avg['Ratio'], # Y-axis: Average Ratio
marker=dict(
color=rank_avg['Ratio'], # Use Ratio value to shade the bars
colorscale='Viridis' # Viridis color scale
)
))
# Update layout properly
fig.update_layout(
title='Average Worked-to-Paid Overtime Ratio by Rank', # Title
xaxis_title='Rank', # X-axis label
yaxis_title='Average WRKDHRS / OTHOURS Ratio', # Y-axis label
xaxis_tickangle=0, # Keep x labels straight
plot_bgcolor='white', # White background
margin=dict(l=20, r=20, t=80, b=80),
yaxis=dict(
range=[0, 1.1], # y-axis from 0 to 1.1
showgrid=True, # THIS is how you enable grid lines
gridcolor='lightgrey' # Optional: make the grid lines light grey
),
)
# Show the figure
fig.show()
In [6]:
df.sample(5)[['Employee', 'Ratio', 'YEAR']]
Out[6]:
| Employee | Ratio | YEAR | |
|---|---|---|---|
| 139542 | HOBIN,JOSEPH | 0.944444 | 2013 |
| 744750 | BUTLER,BOBBIE | 0.812500 | 2018 |
| 234038 | SPILLANE,MATTHEW | 1.000000 | 2013 |
| 879554 | CONLEY,KENNETH M | 1.000000 | 2019 |
| 952487 | GARCIA,ELVIS | 0.562500 | 2020 |
In [7]:
# Create the figure
fig = go.Figure()
# Create figure
fig = go.Figure()
# Add histogram with fine bins
fig.add_trace(go.Histogram(
x=df["Ratio"],
xbins=dict(
start=0,
end=1,
size=0.1
),
marker_color='steelblue',
marker_line_color='black',
marker_line_width=1
))
# Update layout
fig.update_layout(
title="Distribution of Ratios of Overtime Paid Hours / Worked Hours (2012–2021)",
xaxis_title="Ratio of Overtime Paid Hours / Worked Hours",
yaxis_title="Frequency",
plot_bgcolor='white',
bargap=0.2,
margin=dict(l=20, r=20, t=80, b=80),
yaxis=dict(showgrid=True, gridcolor='lightgrey'),
xaxis=dict(range=[0, 1])
)
# Show figure
fig.show()